{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Set before torch/transformers are imported in the next cell.\n",
    "# An empty CUDA_VISIBLE_DEVICES exposes no CUDA devices to this process.\n",
    "# TF_FORCE_GPU_ALLOW_GROWTH is a TensorFlow setting (no TensorFlow import is visible in this notebook).\n",
    "os.environ.update({\n",
    "    'CUDA_VISIBLE_DEVICES': '',\n",
    "    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "import transformers\n",
    "from transformers import BertTokenizer, AutoModelForMaskedLM, BertConfig, BertForMaskedLM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "checkpoint-570000  eval_results_mlm_wwm.txt  trainer_state.json\r\n",
      "checkpoint-580000  pytorch_model.bin\t     training_args.bin\r\n",
      "checkpoint-590000  runs\t\t\t     train_results.txt\r\n",
      "checkpoint-600000  special_tokens_map.json   vocab.txt\r\n",
      "checkpoint-610000  tokenizer_config.json\r\n",
      "config.json\t   tokenizer.json\r\n"
     ]
    }
   ],
   "source": [
    "# List the training output directory (checkpoints, config, tokenizer files).\n",
    "!ls ./malay-cased-bert-base-mlm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the masked-LM weights from checkpoint-610000, the highest-numbered checkpoint in the directory listing.\n",
    "model = BertForMaskedLM.from_pretrained('./malay-cased-bert-base-mlm/checkpoint-610000')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): the tokenizer is read from ./malay-cased-bert-base, not from the\n",
    "# ./malay-cased-bert-base-mlm directory that holds the checkpoint (its listing also shows vocab.txt).\n",
    "# Presumably both share the same vocabulary - confirm before publishing.\n",
    "tokenizer = BertTokenizer.from_pretrained('./malay-cased-bert-base')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:39: FutureWarning: Pass token='bert-base-standard-bahasa-cased' as keyword args. From version 0.7 passing these as positional arguments will result in an error\n",
      "  warnings.warn(\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:79: FutureWarning: `name` and `organization` input arguments are deprecated and will be removed in v0.7. Pass `repo_id` instead.\n",
      "  warnings.warn(\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/huggingface_hub/hf_api.py:596: FutureWarning: `create_repo` now takes `token` as an optional positional argument. Be sure to adapt your code!\n",
      "  warnings.warn(\n",
      "Cloning https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased into local empty directory.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0198367b9c0f4118a575cca4a7591c48",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload file pytorch_model.bin:   0%|          | 4.00k/422M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "remote: Scanning LFS files for validity, may be slow...        \n",
      "remote: LFS file scan complete.        \n",
      "To https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased\n",
      "   13c596b..ef8fa11  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased/commit/ef8fa1193d5aa1107abcbfae19306d01a3333adf'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload the model to the mesolitica/bert-base-standard-bahasa-cased repo on the Hugging Face Hub.\n",
    "# NOTE(review): the recorded stderr shows FutureWarnings that `name`/`organization` are deprecated\n",
    "# in favour of `repo_id`; revisit this call when upgrading huggingface_hub/transformers.\n",
    "model.push_to_hub('bert-base-standard-bahasa-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "To https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased\n",
      "   ef8fa11..a98734c  main -> main\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'https://huggingface.co/mesolitica/bert-base-standard-bahasa-cased/commit/a98734cce021b1723ad91021a16fd65aa8715dac'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload the tokenizer files to the same Hub repo (the recorded output shows a second commit on main).\n",
    "# NOTE(review): `organization=` is the argument flagged as deprecated in the FutureWarnings\n",
    "# recorded for the model upload cell; revisit when upgrading huggingface_hub/transformers.\n",
    "tokenizer.push_to_hub('bert-base-standard-bahasa-cased', organization='mesolitica')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Update the local clone of the Hub repo (the model upload logged cloning it into a local directory;\n",
    "# presumably that directory is ./bert-base-standard-bahasa-cased - confirm).\n",
    "!cd bert-base-standard-bahasa-cased && git pull"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the training `runs` directory into the local repo clone, then commit and push it\n",
    "# (presumably TensorBoard event logs, going by the commit message).\n",
    "# `git add .` stages everything in the clone; the && chain stops at the first failing step.\n",
    "!cp -r malay-cased-bert-base-mlm/runs bert-base-standard-bahasa-cased\n",
    "!cd bert-base-standard-bahasa-cased && git add . && git commit -m 'add tensorboard' && git push"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline\n",
    "\n",
    "# Rebind `tokenizer` and `model` through the Auto* classes, using the same local paths as the upload cells.\n",
    "# do_lower_case=False is forwarded to the tokenizer (cased model).\n",
    "tokenizer = AutoTokenizer.from_pretrained('./malay-cased-bert-base', do_lower_case=False)\n",
    "model = AutoModelForMaskedLM.from_pretrained('./malay-cased-bert-base-mlm/checkpoint-610000')\n",
    "\n",
    "# Masked-token prediction pipeline used by the demo cells below.\n",
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'score': 0.22050447762012482,\n",
       "  'token': 9687,\n",
       "  'token_str': 'ditolak',\n",
       "  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan ditolak.'},\n",
       " {'score': 0.09319040179252625,\n",
       "  'token': 2673,\n",
       "  'token_str': 'dibuat',\n",
       "  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dibuat.'},\n",
       " {'score': 0.06558076292276382,\n",
       "  'token': 5730,\n",
       "  'token_str': 'dikemukakan',\n",
       "  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dikemukakan.'},\n",
       " {'score': 0.04289316013455391,\n",
       "  'token': 3812,\n",
       "  'token_str': 'diterima',\n",
       "  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan diterima.'},\n",
       " {'score': 0.03790011256933212,\n",
       "  'token': 14732,\n",
       "  'token_str': 'dibatalkan',\n",
       "  'sequence': 'Permohonan Najib untuk dengar isu perlembagaan dibatalkan.'}]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Formal, news-style Malay sentence with the mask in the final position.\n",
    "fill_mask('Permohonan Najib untuk dengar isu perlembagaan [MASK] .')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'score': 0.07294066250324249,\n",
       "  'token': 3569,\n",
       "  'token_str': 'faham',\n",
       "  'sequence': 'aku ni tak faham lah, kau sebenarnya nak faham.'},\n",
       " {'score': 0.05976617708802223,\n",
       "  'token': 1956,\n",
       "  'token_str': 'tahu',\n",
       "  'sequence': 'aku ni tak faham lah, kau sebenarnya nak tahu.'},\n",
       " {'score': 0.05713663622736931,\n",
       "  'token': 4248,\n",
       "  'token_str': 'berubah',\n",
       "  'sequence': 'aku ni tak faham lah, kau sebenarnya nak berubah.'},\n",
       " {'score': 0.028323516249656677,\n",
       "  'token': 2723,\n",
       "  'token_str': 'belajar',\n",
       "  'sequence': 'aku ni tak faham lah, kau sebenarnya nak belajar.'},\n",
       " {'score': 0.01882651261985302,\n",
       "  'token': 4330,\n",
       "  'token_str': 'tolong',\n",
       "  'sequence': 'aku ni tak faham lah, kau sebenarnya nak tolong.'}]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Colloquial Malay sentence with the mask in the final position.\n",
    "fill_mask('aku ni tak faham lah, kau sebenarnya nak [MASK] .')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'score': 0.06390859186649323,\n",
       "  'token': 2423,\n",
       "  'token_str': 'je',\n",
       "  'sequence': 'sya nak makan je.'},\n",
       " {'score': 0.058914054185152054,\n",
       "  'token': 1485,\n",
       "  'token_str': 'apa',\n",
       "  'sequence': 'sya nak makan apa.'},\n",
       " {'score': 0.044890888035297394,\n",
       "  'token': 1617,\n",
       "  'token_str': 'banyak',\n",
       "  'sequence': 'sya nak makan banyak.'},\n",
       " {'score': 0.03408944234251976,\n",
       "  'token': 1473,\n",
       "  'token_str': 'lagi',\n",
       "  'sequence': 'sya nak makan lagi.'},\n",
       " {'score': 0.031660545617341995,\n",
       "  'token': 2124,\n",
       "  'token_str': 'sekali',\n",
       "  'sequence': 'sya nak makan sekali.'}]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Informal spelling (`sya`, presumably short for `saya`) with the mask in the final position.\n",
    "fill_mask('sya nak makan [MASK] .')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'score': 0.4603753983974457,\n",
       "  'token': 1722,\n",
       "  'token_str': 'buat',\n",
       "  'sequence': 'awak nak buat apa?'},\n",
       " {'score': 0.07606375217437744,\n",
       "  'token': 1868,\n",
       "  'token_str': 'makan',\n",
       "  'sequence': 'awak nak makan apa?'},\n",
       " {'score': 0.059539202600717545,\n",
       "  'token': 2532,\n",
       "  'token_str': 'cakap',\n",
       "  'sequence': 'awak nak cakap apa?'},\n",
       " {'score': 0.045212969183921814,\n",
       "  'token': 1619,\n",
       "  'token_str': 'kata',\n",
       "  'sequence': 'awak nak kata apa?'},\n",
       " {'score': 0.03384365886449814,\n",
       "  'token': 1801,\n",
       "  'token_str': 'jadi',\n",
       "  'sequence': 'awak nak jadi apa?'}]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Conversational question with the mask in the middle of the sentence.\n",
    "fill_mask('awak nak [MASK] apa?')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
